
"""
Goal: Extract 5-gram entries that contain the word "finance" (or its language-specific stem) from the Google Books Ngram corpus.
Contact: mjha@gsu.edu (author), manela@wustl.edu, hongyi.liu@wustl.edu
Notes: Because of the size of the corpus, we suggest running one language and one year at a time.
"""

#################### user defined
# Locations (originally noted as "input from bash"; hard-coded to the empty
# string here, i.e. the current working directory).
gzfiledir = ""        # where the input .gz n-gram files are looked up
pickle_filedir = ""   # where the resulting pickle files are written

# Each entry pairs a corpus language code with the list of substrings that
# mark a line as finance-related for that language.
# Other pairs that have been used with this script:
#   ('ita', ['finanz']), ('fre', ['financ']), ('spa', ['finan']),
#   ('eng-gb', ['financ']), ('ger', ['finanz']), ('rus', ['финан']),
#   ('eng-us', ['financ']), ('chi-sim', ['金融', '金 融', '金_融'])
lang_fin_tuple = [
    ('eng-us', ['financ']),
]

# Corpus year to process.
yr = 1870
    
############################### import
import pickle
import pandas as pd
import io
import re

import gzip
from datetime import datetime
import os.path


# Characters stripped from every matched line before counting: quotes,
# guillemet/bullet, ASCII punctuation and all digits.
# Written as a raw string so that `\]` reaches the regex engine untouched
# instead of being an invalid escape sequence in a normal string literal;
# the pattern text itself is unchanged.
# NOTE: `,-/` inside the class is a range, so it removes ',', '-', '.' and
# '/'. The backslash before the double quote only escapes that quote, so a
# literal backslash in the input is NOT removed.
regex = re.compile(r"""['\"»•;!#$%&()*+,-/:;<=>?@[\]^_`{|}~0123456789]""")

# For every (language code, search substrings) pair: read that language's
# gzip-compressed 5-gram sample file for year `yr`, keep the lines containing
# any of the substrings, strip punctuation/digits, count how many raw lines
# collapse to each cleaned text, and pickle the resulting DataFrame.
for langcode, fin_words in lang_fin_tuple:
    # langcode: e.g. 'ger'; fin_words: e.g. ['finanz']

    ########################## do we need to run for the year
    # Build the output path once so that the "already done" check looks at
    # exactly the file this iteration writes (the check previously tested a
    # name without the '_sample' suffix and therefore never matched it).
    pickle_path = os.path.join(pickle_filedir, langcode + str(yr) + '_sample.p')
    if os.path.exists(pickle_path):
        print("pickle file exists")
        print(langcode)
        print(yr)
        continue  # goes to the next value in the loop

    ####################### pickle
    starttime = datetime.now()

    # Read data
    gz_path = os.path.join(gzfiledir, langcode + "-all-5gram-" + str(yr) + '_sample.gz')
    try:
        gz = gzip.open(gz_path, 'rb')
    except OSError:
        # Missing or unreadable input: report and move on to the next
        # language. Only OS-level errors are swallowed, so Ctrl-C still works.
        print("no gzfile")
        print(langcode)
        print(yr)
        continue

    # Decode each line exactly once and collect matches in a plain list;
    # the DataFrame is built once afterwards instead of being re-allocated
    # for every matching line.
    matched_lines = []
    with gz:  # guarantees the handle is closed even if reading fails
        for raw_line in gz:
            text = raw_line.decode()
            if any(w in text for w in fin_words):
                matched_lines.append(text)
    gztime = datetime.now()

    # Prepare
    # Remove the characters matched by `regex`, then collapse all runs of
    # whitespace (including the trailing newline) into single spaces.
    cleaned = [' '.join(regex.sub('', text).split()) for text in matched_lines]
    fin_ngrams = pd.DataFrame({'ngrams': cleaned})
    # 'counts' is the number of raw lines that map to the same cleaned text.
    fin_ngrams['counts'] = fin_ngrams.groupby('ngrams')['ngrams'].transform('count')
    fin_ngrams = (
        fin_ngrams.drop_duplicates(subset="ngrams", keep='first')
        .sort_values('ngrams')
        .reset_index(drop=True)
    )

    # write results
    with open(pickle_path, "wb") as out_file:
        pickle.dump(fin_ngrams, out_file)

    # print
    print(langcode)
    print(yr)
    print(starttime)
    print(gztime)




